# imports, `Value` class, reset_graph() to init nn, graphviz: trace() & draw_dot(), and PyTorch: `Neuron`, `Layer`, `MLP` classes
 
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
 
# extend `Value` class with the constituent methods listed above:
class Value:
    """A scalar node in an autograd computation graph.

    Stores a float `data`, its gradient `grad` (d(final output)/d(this node)),
    the child nodes that produced it, and a `_backward` closure that propagates
    `out.grad` into the children's grads via the chain rule.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0                 # accumulated gradient; populated by backward()
        self._backward = lambda: None   # leaf nodes have nothing to propagate
        self._prev = set(_children)     # child nodes this value was computed from
        self._op = _op                  # op that produced this node (for graphviz labels)
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})" 

    def __add__(self, other):
        # pre-process `other`. If it is non-`Value`, assume `int`/`float` and wrap in `Value()`
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # local grad of `+` is 1 for both operands; accumulate (+=) because a
            # node may appear multiple times in the graph
            self.grad += out.grad * 1.0
            other.grad += out.grad * 1.0
        out._backward = _backward
        
        return out

    def __mul__(self, other):
        # pre-process `other`. If it is non-`Value`, assume int/float and wrap in `Value()`
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # d(a*b)/da = b and d(a*b)/db = a
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        
        return out

    def __radd__(self, other):  # fallback for swapped operands: i.e. other + self
        return self + other     # route to `__add__`
    
    def __rmul__(self, other):  # fallback for swapped operands: i.e. other * self
        return self * other     # route to `__mul__`
    
    # ensure `other` is NEVER a `Value` object. Only int/float allowed
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')
        
        # recall downstream grad = local grad * upstream grad
        # local gradient for x^k: d(x^k)/dx = kx^(k-1)
        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        
        return out
    
    def __truediv__(self, other): # i.e. self / other but...
        return self * other**-1   # use previously defined __mul__() and __pow__(), instead of implementing `/` operation and its own `_backward()``
    
    def __rtruediv__(self, other):  # fallback for swapped operands: i.e. other / self
        return other * self**-1     # route through __rmul__() and __pow__()
    
    def __neg__(self): # -self
        return self * -1        # use previously defined __mul__() to evaluate this `Value` * `int` expression
    
    def __sub__(self, other):   # self - other
        return self + (-other)  # use previously defined __add__(), instead of implementing `-` operation and its own `_backward()``

    def __rsub__(self, other):  # fallback for swapped operands: i.e. other - self
        return other + (-self)  # route through __radd__() and __neg__()

    def tanh(self):
        """Hyperbolic tangent activation: returns a new Value node."""
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d(tanh x)/dx = 1 - tanh(x)^2
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        
        return out
    
    # define exponentiation method
    def exp(self):
        """e**self, as a new Value node."""
        x = self.data                               # input data value
        out = Value(math.exp(x), (self, ), 'exp')   # output data value: use builtin math.exp(x)
        
        # recall downstream grad = local grad * upstream grad
        # local gradient for exp: d(e^x)/dx = e^x (i.e. out.data, just calculated!)
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        
        return out

    def backward(self):
        """Run backpropagation from this node: topologically sort the graph,
        seed this node's grad with 1.0, then call each node's _backward()
        in reverse topological order."""
        topo = []
        visited = set()
        
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
 
# helper: re-initialise graph
def reset_graph(reset_level):
    """Re-initialise the demo neuron graph.

    reset_level='gradients' zeroes every node's grad (data untouched);
    reset_level='graph' rebuilds all nodes from scratch.
    """

    # the demo nodes live at module scope so notebook cells can reference them
    global x1, x2, w1, w2, x1w1, x2w2, x1w1x2w2, b, n, o

    if reset_level == 'gradients':
        # zero the gradient on every existing node
        for node in (x1, x2, w1, w2, x1w1, x2w2, x1w1x2w2, b, n, o):
            node.grad = 0
        print("reset_graph(): All gradients have been reset to 0")

    elif reset_level == 'graph':
        # rebuild inputs (x1,x2) and weights (w1,w2), then the pre-activation n = x1*w1 + x2*w2 + b
        x1 = Value(2.0, label='x1')
        x2 = Value(0.0, label='x2')
        w1 = Value(-3.0, label='w1')
        w2 = Value(1.0, label='w2')
        x1w1 = x1 * w1
        x1w1.label = 'x1*w1'
        x2w2 = x2 * w2
        x2w2.label = 'x2*w2'
        x1w1x2w2 = x1w1 + x2w2
        x1w1x2w2.label = 'x1*w1 + x2*w2'

        # bias chosen to make numbers nice for teaching (b=8 shows tanh squashing;
        # b=6.8813735870195432 makes the derivative come out to exactly 1)
        b = Value(6.8813735870195432, label='b')
        n = x1w1x2w2 + b
        n.label = 'n'

        # apply the activation to n (the raw cell body) to get the output node o
        o = n.tanh()
        o.label = 'o'

        print("reset_graph(): All vars, initial and intermediate, have been reset. All gradients now 0")

    else:
        print("reset_graph(): please specify the level of reset desired 'gradients' or 'graph'")
 
# graphviz
from graphviz import Digraph
 
def trace(root):
    """Walk the graph backwards from `root`, collecting every node and every
    (child, parent) edge. Returns (nodes, edges) as two sets."""
    nodes, edges = set(), set()
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue  # already explored this node
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)
    return nodes, edges
 
def draw_dot(root):
    """Render the computation graph rooted at `root` as a graphviz Digraph (SVG)."""
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)
    for node in nodes:
        uid = str(id(node))
        # every value gets a rectangular ('record') node showing label, data and grad
        graph.node(name=uid, label="{ %s | data %.4f | grad %.4f }" % (node.label, node.data, node.grad), shape='record')
        if node._op:
            # values produced by an operation get an extra op node, wired into them
            graph.node(name=uid + node._op, label=node._op)
            graph.edge(uid + node._op, uid)
    for child, parent in edges:
        # connect each child to the op node of its parent
        graph.edge(str(id(child)), str(id(parent)) + parent._op)
    return graph

PyTorch-style Neuron, Layer, MLP class definitions (built on our Value class, mimicking PyTorch's API), unchanged from the previous notebook:

# PyTorch Neuron, Layer, MLP classes
class Neuron:
    """A single neuron: `nin` weights, one bias, tanh activation."""

    def __init__(self, nin):
        # one random weight per input, plus one bias, all drawn from [-1, 1]
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Forward pass of this neuron: tanh((w . x) + b)."""
        # accumulate the weighted inputs on top of the bias
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        # squash the pre-activation through the non-linearity
        return act.tanh()

    def parameters(self):
        """All trainable Values of this neuron: the incoming weights, then the bias."""
        return self.w + [self.b]
 
class Layer:
    """A fully-connected layer of `nout` independently evaluated neurons."""

    def __init__(self, nin, nout):
        # `nout` neurons, each accepting `nin` inputs
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        """Forward pass through every neuron. A single-neuron layer returns a
        bare Value rather than a one-element list."""
        outs = [neuron(x) for neuron in self.neurons]
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        """Flat list of every neuron's parameters (weights and biases) in this layer."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
 
class MLP:
    """Multi-layer perceptron: a chain of Layers sized nin -> nouts[0] -> ... -> nouts[-1]."""

    def __init__(self, nin, nouts):
        sizes = [nin] + nouts
        # consecutive size pairs define each layer's (fan_in, fan_out)
        self.layers = [Layer(fan_in, fan_out) for fan_in, fan_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        """Forward pass: feed `x` through each layer in sequence."""
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        """Flat list of every parameter (all weights and biases) across all layers."""
        return [p for layer in self.layers for p in layer.parameters()]
 
# helper function to inspect 1st layer, 1st neuron: data and grad of bias and 1st weight
def inspect_params_l1_n1_w_b(network):
    """Print data and grad of the 1st layer / 1st neuron's bias and 1st weight."""
    neuron = network.layers[0].neurons[0]  # hoist the common lookup
    print('1st layer, 1st neuron\'s bias, data:', neuron.b.data)
    print('1st layer, 1st neuron\'s bias, grad:', neuron.b.grad)
    print('1st layer, 1st neuron, 1st weight, data:', neuron.w[0].data)
    print('1st layer, 1st neuron, 1st weight, grad:', neuron.w[0].grad)

Example binary classifier

Perform backpropagation with our `Value` class's `.backward()` method (mirroring PyTorch's built-in API).

# o - define binary classifier NN
xs = [                  # dataset with 4 examples (3 input features each)
    [2.0, 3.0, -1.0],   # when nn is fed ex. 1 -> desired output: 1.0
    [3.0, -1.0, 0.5],   # when nn is fed ex. 2 -> desired output: -1.0
    [0.5, 1.0, 1.0],    # when nn is fed ex. 3 -> desired output: -1.0
    [1.0, 1.0, -1.0],   # when nn is fed ex. 4 -> desired output: 1.0
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets (one per example)
 
# initialise neural network (MLP): 3 inputs -> two hidden layers of 4 -> 1 output
nn = MLP(3, [4, 4, 1])
 
# forward pass: what the (randomly initialised) nn currently outputs for these 4 examples
ypred = [nn(x) for x in xs]
 
print("For the 4 examples `xs`, we...:")
print("have actual nn outputs `ypred` -> want desired outputs `ys`")
for y_pred, y in zip(ypred, ys):
    print(f"{y_pred.data} -> {y}")
 
For the 4 examples `xs`, we...:
have actual nn outputs `ypred` -> want desired outputs `ys`
-0.030669685297522595 -> 1.0
0.618279041630556 -> -1.0
0.3365212611340581 -> -1.0
0.7533370832196005 -> 1.0

Inspect above output

Implement MSE (L2) loss function

  • Predicted outputs ypred (yout) are quite different from desired outputs ys (ygt).
  • The loss is a single number that measures the total performance of the neural network.
  • We use the mean squared error (MSE) loss function:

    $$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2$$

  • where
    • $\mathrm{MSE}$ : mean squared error (loss function)
    • $n$ : number of data points
    • $y_i$ : observed values (ys, or ground truth: ygt)
    • $\hat{y}_i$ : predicted values (ypred, or output: yout)

If $y_i = \hat{y}_i$ (i.e. ygt = yout), then that error term is $0$. Otherwise, $(y_i - \hat{y}_i)^2 > 0$. The larger the difference, the greater the loss is.

# o - calculated individual loss (squared error) components for each data point:
print('individual loss (squared error) component for each data point (bigger error = worse!):')
print(*[err.data for err in [(yout - ygt)**2 for ygt, yout in zip(ys, ypred)]], sep='\n')
 
print('\ncompare individual loss components with previous block\'s outputs \n(see size of difference between each element of `ypred` and corresponding element of `ys`)\n\n---')
 
# total loss: sum of squared errors over all 4 examples, as a single Value node
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print('\ntotal loss (MSE):', loss.data)
individual loss (squared error) component for each data point (bigger error = worse!):
1.0622800001912942
2.6188270565807112
1.7862890814633734
0.0608425945146143
 
compare individual loss components with previous block's outputs 
(see size of difference between each element of `ypred` and corresponding element of `ys`)
 
---
 
total loss (MSE): 5.528238732749994

Inspect above output

Inspecting network parameters

In gradient-descent, a parameter’s grad vector (scalar, in this example) points in the direction of steepest ascent.

Gradient Descent: Improving the loss function

After performing the loss.backward() pass, the grad of all parameter nodes (weights, biases) will change from 0 to some other value:

  • If grad < 0, that parameter negatively influences the loss function
    • To improve (minimise) the loss function, increase that parameter’s data value
  • If grad > 0, that parameter positively influences the loss function
    • To improve (minimise) the loss function, decrease that parameter’s data value

Note:

  • grads for the input data, xs, are not useful since we cannot change the input data.
  • grads on all the w and b values are useful, since we use them to update those parameters’ data values
# o - inspect a single weight and bias (data value and gradient) before backward pass
print('before backward pass: `loss.backward()`')
inspect_params_l1_n1_w_b(nn)
before backward pass: `loss.backward()`
1st layer, 1st neuron's bias, data: -0.2527039348623852
1st layer, 1st neuron's bias, grad: 0.0
1st layer, 1st neuron, 1st weight, data: 0.5858915688216553
1st layer, 1st neuron, 1st weight, grad: 0.0
# o - perform backward pass on loss "output" node -> re-inspect the same weight and bias AFTER this:
# backprop populates .grad on every node in the graph, including all network parameters
loss.backward()
 
print('after backward pass: `loss.backward()`')
inspect_params_l1_n1_w_b(nn)
after backward pass: `loss.backward()`
1st layer, 1st neuron's bias, data: -0.2527039348623852
1st layer, 1st neuron's bias, grad: -0.6425469767350062
1st layer, 1st neuron, 1st weight, data: 0.5858915688216553
1st layer, 1st neuron, 1st weight, grad: -0.1083991069576411

Inspect both above outputs

# i - HUGE output graph. worth inspecting root node (`loss`) and leaf nodes (input data: `xs`, bias `b`)
# draw_dot(loss)

Inspecting ALL parameters

# i - inspect all parameters in the MLP network
# expect 41 params: layer1 (3*4 w + 4 b) + layer2 (4*4 w + 4 b) + layer3 (4*1 w + 1 b)
print('no. of parameters:',len(nn.parameters()))
print('\nparameter values:', *nn.parameters(), sep='\n')
no. of parameters: 41
 
parameter values:
Value(data=0.5858915688216553)
Value(data=0.24204639586837384)
Value(data=0.5540617455238359)
Value(data=-0.2527039348623852)
Value(data=-0.0024192363061177335)
Value(data=-0.32365360415300093)
Value(data=-0.12238259702033294)
Value(data=-0.0009319366871884949)
Value(data=0.853482765777527)
Value(data=0.9518014994361543)
Value(data=0.4694005041885234)
Value(data=-0.7151731529707315)
Value(data=-0.4132421662812258)
Value(data=0.8742470275180891)
Value(data=-0.4770288161590237)
Value(data=-0.512392518490292)
Value(data=-0.7214804589607708)
Value(data=0.15524433510895141)
Value(data=-0.606947212375128)
Value(data=0.8057384636851816)
Value(data=0.34423675024790956)
Value(data=0.8911643253773633)
Value(data=-0.3074846830517708)
Value(data=0.674843391644385)
Value(data=0.9803638159760659)
Value(data=-0.9379344688828088)
Value(data=0.026729242517386398)
Value(data=0.5592713428822869)
Value(data=0.36755329819874083)
Value(data=-0.7500473600441895)
Value(data=-0.03331756079480508)
Value(data=0.6999906180870976)
Value(data=0.3953800862639454)
Value(data=0.6509811930800722)
Value(data=0.9173726035761776)
Value(data=0.805925459887916)
Value(data=0.569870544614919)
Value(data=-0.6917547546903493)
Value(data=0.07826376203371344)
Value(data=-0.21450799987280855)
Value(data=0.9705860522133329)

Gradient descent

Minimise the loss function

  • For each parameter (incoming weights, or bias) in the neural network: p in nn.parameters()
  • Change that parameter’s data value p.data slightly, in a direction OPPOSITE to its gradient p.grad
    • i.e. p.data += -0.1 * p.grad
# i - perform 1 iteration of gradient descent (on all parameters in network)
# step OPPOSITE to the grad (grad points towards steepest ascent of the loss), step size 0.1
for p in nn.parameters():
    p.data += -0.1 * p.grad
# o - re-inspect the same weight and bias from before, AFTER performing gradient descent:
 
print('after gradient descent (iteration: 1)')
inspect_params_l1_n1_w_b(nn)
after gradient descent (iteration: 1)
1st layer, 1st neuron's bias, data: -0.18844923718888457
1st layer, 1st neuron's bias, grad: -0.6425469767350062
1st layer, 1st neuron, 1st weight, data: 0.5967314795174193
1st layer, 1st neuron, 1st weight, grad: -0.1083991069576411

Manually loop: 3 iterations

Iteration 2

# i - forward pass, inspect loss function value
# (loss should now be lower than iteration 1's value, after the parameter nudge)
ypred = [nn(x) for x in xs] # calculate new network values
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred)) # recalculate loss
print('\nforward pass (iter 2): total loss (MSE):', loss.data)
forward pass (iter 2): total loss (MSE): 1.029708597714175
# i - re-initialise all parameter gradients (grads accumulate via += in _backward, so zero them first!)
for p in nn.parameters():
    p.grad = 0.0
# i - backward pass: update gradients on all parameters via backprop
loss.backward()
# i - gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
    p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
# o - inspect parameters and predicted values
print('after gradient descent (iteration: 2)\nparam values:')
inspect_params_l1_n1_w_b(nn)
print('\npredicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
    print(f"{y_pred.data} -> {y}")
after gradient descent (iteration: 2)
param values:
1st layer, 1st neuron's bias, data: -0.2103971111257778
1st layer, 1st neuron's bias, grad: 0.21947873936893247
1st layer, 1st neuron, 1st weight, data: 0.5318873931345732
1st layer, 1st neuron, 1st weight, grad: 0.6484408638284616
 
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.38199162402373854 -> 1.0
-0.6145309735746732 -> -1.0
-0.34199409598487684 -> -1.0
0.7426750985906923 -> 1.0

Iteration 3

# o - forward pass, inspect loss function value (iteration 3)
ypred = [nn(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print('\nforward pass (iter 3): total loss (MSE):', loss.data)
forward pass (iter 3): total loss (MSE): 0.36594039334941736
# re-initialise all parameter gradients (grads accumulate via +=, so zero before backprop)
for p in nn.parameters():
    p.grad = 0.0
# backward pass: update gradients on all parameters via backprop
loss.backward()
# gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
    p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
# o - inspect parameters and predicted values
print('after gradient descent (iteration: 3)')
inspect_params_l1_n1_w_b(nn)
print('\npredicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
    print(f"{y_pred.data} -> {y}")
after gradient descent (iteration: 3)
1st layer, 1st neuron's bias, data: -0.2169528627439334
1st layer, 1st neuron's bias, grad: 0.0655575161815562
1st layer, 1st neuron, 1st weight, data: 0.49980160109216426
1st layer, 1st neuron, 1st weight, grad: 0.32085792042408956
 
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.6674766302017127 -> 1.0
-0.7760980798078445 -> -1.0
-0.5706257617365379 -> -1.0
0.8555206051461104 -> 1.0

Iteration 4

# o - forward pass, inspect loss function value (iteration 4)
ypred = [nn(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print('\nforward pass (iter 4): total loss (MSE):', loss.data)
forward pass (iter 4): total loss (MSE): 0.18176975167171786
# re-initialise all parameter gradients (grads accumulate via +=, so zero before backprop)
for p in nn.parameters():
    p.grad = 0.0
# backward pass: update gradients on all parameters via backprop
loss.backward()
# gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
    p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
# o - inspect parameters and predicted values
print('after gradient descent (iteration: 4)')
inspect_params_l1_n1_w_b(nn)
print('\npredicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
    print(f"{y_pred.data} -> {y}")
after gradient descent (iteration: 4)
1st layer, 1st neuron's bias, data: -0.22526972514243898
1st layer, 1st neuron's bias, grad: 0.0831686239850558
1st layer, 1st neuron, 1st weight, data: 0.4749438078763519
1st layer, 1st neuron, 1st weight, grad: 0.2485779321581236
 
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.7425864308571539 -> 1.0
-0.8457322647516171 -> -1.0
-0.7213795164363924 -> -1.0
0.8813404108679985 -> 1.0

Automate loop: 20 iterations

# i - perform 20 more gradient descent iterations (for all 41 network parameters)
for k in range(20):
    # forward pass, with mean square error (MSE) loss 
    ypred = [nn(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
    
    # re-initialise all parameter gradients (grads accumulate via +=, so zero before backprop)
    for p in nn.parameters():
        p.grad = 0.0
    
    # backward pass: update gradients on all parameters via backprop
    loss.backward()
    
    # gradient descent: nudge (0.1) all params in direction of steepest descent
    for p in nn.parameters():
        p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
    
    # track progress: the loss should shrink every iteration
    print(k, loss.data)
 
0 0.11852673045381416
1 0.0880052700087189
2 0.06987862296212292
3 0.057878823177831724
4 0.04935460171584655
5 0.0429905228649674
6 0.03805999662401936
7 0.03412907715440516
8 0.03092268619290014
9 0.02825805578160131
10 0.026009033488036827
11 0.024085754486472596
12 0.022422480881888758
13 0.020970021409476274
14 0.01969083836718791
15 0.01855579283222802
16 0.01754192231835979
17 0.016630888154989535
18 0.01580786849680682
19 0.015060754626918213
# o - inspect network's predictions: ypred (after training, each should be close to its +-1.0 target)
print('predicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
    print(f"{y_pred.data} -> {y}")
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.9327166974173132 -> 1.0
-0.9544386424494977 -> -1.0
-0.9190363251214165 -> -1.0
0.9563793871111931 -> 1.0

Inspect above outputs

Takeaways

Neural network, forward pass, loss function, backward pass, gradient descent

  • Neural network: A mathematical expression (graph) with the following structure:
    • input data (e.g. input layer neurons, each with a value),
    • input parameters (full network’s weights, biases)
    • hidden and output layer neurons (each with activation values)
  • Forward pass: Mathematical expression computing the value at each neuron
    • based on the input data, and parameters on that iteration
  • Loss function: Accuracy of the network’s predictions (final layer outputs in NN).
    • Goal: minimise the loss function
    • Low loss means the predictions ypred/yout are better matching the targets ys/ygt
  • Backward pass: Use backpropagation to update each parameter’s gradient
  • Gradient descent: Tune all parameters to iteratively decrease the loss locally (minimise loss function)
  • Repeat for N iterations

Sources