# imports, `Value` class, graphviz: trace() & draw_dot()
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
# Object definitions from end of previous chapter:
 
# Value class:
class Value:
 
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label
 
    def __repr__(self):
        return f"Value(data={self.data})" 
 
    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        return out
 
    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        return out
 
# graphviz
from graphviz import Digraph
 
def trace(root):
    # recursively builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes: 
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges
 
def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name = uid + n._op, label = n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)
    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    return dot
 
# draw_dot(L)

8. Manual backpropagation (train a neuron)

Motivating example: an MLP and a single neuron

An example neural network (MLP): [image: example MLP diagram]

A mathematical model of a single neuron in an MLP. Note the multiplicative relationship between each input and its weight (synapse): [image: mathematical model of a neuron]
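In the notation used in the code below (two inputs, as in this example), the neuron computes o = tanh(x1*w1 + x2*w2 + b), where w1 and w2 are the weights, b is the bias, and tanh is the activation function.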

  • The activation function is a squashing function (e.g. Sigmoid, ReLU, GELU)
    • In this example, the activation function is the hyperbolic tangent, tanh(x) = (e^(2x) - 1)/(e^(2x) + 1):
# example activation function: tanh smoothly caps large (+ or -) inputs to +1 or -1 respectively
plt.figure(figsize=(4, 3), dpi=80)
plt.plot(np.arange(-5,5,0.2), np.tanh(np.arange(-5,5,0.2))); plt.grid()
[plot: tanh curve, squashing inputs in [-5, 5) to the range (-1, +1)]

8.1. Define the forward pass (i.e. initialise NN)

  • Initialise:
    • neuron inputs (data): x1 and x2
    • weights: w1 and w2
    • bias: b
  • Then compute the neuron’s pre-activation value: n = x1*w1 + x2*w2 + b
  • Visualise the computation graph as a DAG (with graphviz):
# init nn: params x1,x2; w1,w2; b -> intermediate nodes x1w1, x2w2, x1w1x2w2 -> output (n)
# neuron inputs x1,x2 (2 dimensional neuron)
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
 
# weights of neuron w1,w2 (synaptic strengths for each input)
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
 
# bias of the neuron
b = Value(6.7, label='b')
 
# following the graph above to create: x1*w1 + x2*w2 + b
x1w1 = x1 * w1; x1w1.label = 'x1*w1'
x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
 
# cell body raw activation (without the activation function)
n = x1w1x2w2 + b; n.label = 'n'
 
draw_dot(n)
[graphviz output: computation graph x1, x2, w1, w2, b -> x1*w1, x2*w2 -> x1*w1 + x2*w2 -> (+ b) -> n; n.data = 0.7000, all grads 0.0000]

8.2. Define the activation function in Value

The cell below (calculating output via activation function tanh) throws an error.

  • tanh is not defined in Value
  • Hyperbolic functions cannot be computed via the Value object’s methods we defined earlier
    • __add__ (+) and __mul__ (*) are insufficient
    • Division and/or exponentiation would also be needed.
# i - output axon (via activation function tanh) -- THROWS ERROR!
o = n.tanh() # throws error since Python doesn't know how to do tanh for a Value object
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[4], line 2
      1 # i - output axon (via activation function tanh) -- THROWS ERROR!
----> 2 o = n.tanh() # throws error since Python doesn't know how to do tanh for a Value object
 
AttributeError: 'Value' object has no attribute 'tanh'
  • We could implement division (__truediv__()) and exp() as new methods on our Value object, and then compose tanh out of those primitives, since tanh(x) = (e^(2x) - 1)/(e^(2x) + 1) (see the sketch below)
  • However, we can also define tanh directly as a method of Value, as long as we know how to take its local derivative
    • Any arbitrarily complicated function can be defined directly in Value, as long as we know how to take its local derivative (how its inputs affect its output)
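As an illustration of that first route, here is a minimal, hypothetical sketch (forward pass only, no gradients; the names ValueSketch and tanh_composed are invented for this example and are not part of the notebook's Value class):

# hypothetical sketch: composing tanh from exp, +, * and division primitives
import math

class ValueSketch:
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self._prev = set(_children)
        self._op = _op

    def __add__(self, other):
        return ValueSketch(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return ValueSketch(self.data * other.data, (self, other), '*')

    def __truediv__(self, other):
        return ValueSketch(self.data / other.data, (self, other), '/')

    def exp(self):
        return ValueSketch(math.exp(self.data), (self,), 'exp')

def tanh_composed(v):
    # tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), built only from the primitives above
    e2x = (v * ValueSketch(2.0)).exp()
    return (e2x + ValueSketch(-1.0)) / (e2x + ValueSketch(1.0))

print(tanh_composed(ValueSketch(0.8814)).data)  # ~0.7071, matches math.tanh(0.8814)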

Now we can compute the neuron’s post-activation value (and visualise with graphviz):

# extend `Value` class with `tanh(self)` method; reset network (slightly modify bias `b`); visualise
class Value:
 
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label
 
    def __repr__(self):
        return f"Value(data={self.data})" 
 
    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        return out
 
    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        return out
 
    # defining the tanh method (our activation function) in one go!
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        
        # the tanh node only has 1 child, so it's a tuple of 1 node "(self, )", 
        # and op name is 'tanh'
        out = Value(t, (self, ), 'tanh')
        return out
 
# same values as earlier: define inputs (x1,x2), weights (w1,w2)
x1 = Value(2.0, label='x1'); x2 = Value(0.0, label='x2')
w1 = Value(-3.0, label='w1'); w2 = Value(1.0, label='w2')
x1w1 = x1 * w1; x1w1.label = 'x1*w1'; x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
 
# manually change the bias to make the numbers nice for education:
# b=8 would show tanh squashing the post-activation value, o, to just below +1,
# b=6.8813735870195432 makes o = 0.7071 and the local derivative do/dn = 1 - o**2 = 0.5
b = Value(6.8813735870195432, label='b')
n = x1w1x2w2 + b; n.label = 'n'
 
# try re-run the activation function on n (the raw cell body) and draw the output node o
o = n.tanh(); o.label = 'o'
draw_dot(o)
[graphviz output: same computation graph extended with a tanh node; n.data = 0.8814, o.data = 0.7071, all grads 0.0000]

8.3. Now the backward pass (backpropagation)

  • We’re particularly interested in do/dw1 and do/dw2
    • We can only change the weights, w1 and w2, during training of the neural net.
    • The data, x1 and x2, is fixed.
  • Also note, this is only one neuron. A real NN has many connected neurons
    • The loss function evaluates to a single number, at the very end of that NN
    • It measures how well the NN is doing (a goalpost for the NN’s backpropagation)

8.3.1 Manually backpropagate (hand-assign gradients of prior nodes)

See the gradient-annotated graph below (after the code); it is immensely helpful.

  1. The base case is known: do/do = 1
  2. Per wikipedia (among many equivalent forms): d/dn tanh(n) = 1 - tanh(n)^2
  • We know o = tanh(n), so by substitution: do/dn = 1 - o^2
  3. n’s gradient is “distributed” (plus node, +) to n’s upstream nodes x1*w1 + x2*w2 and b
  4. The gradient of x1*w1 + x2*w2 is again distributed (plus node, +) to its upstream nodes x1*w1 and x2*w2
  5. Finally, for the last two * nodes, the gradient propagates upstream via multiplication by the other node’s value.
  • For brevity, only showing gradients for the weights, w1 and w2, since the input data, x1 and x2, cannot be changed during training.
# backpropagation: hand-assign gradients, `.grad`, of prior nodes
# base case: manually set o.grad (i.e. d(o)/do = 1 is known) 
o.grad = 1.0
 
# we know o = tanh(n); 
# per wikipedia (or calculus): d/dx tanh(x) = 1 - (tanh(x))^2;
# therefore: do/dn = 1 - tanh(n)**2 (and we know tanh(n) is o.data!)
n.grad = 1 - o.data**2
 
n.grad # (0.5 in this ex.)
 
# n's incoming nodes enter via a '+' node, so n's gradient is simply routed back (i.e. 0.5 again):
x1w1x2w2.grad = n.grad
b.grad = n.grad 
 
# same logic for x1w1x2w2's incoming nodes (another '+' node); route x1w1x2w2 gradient!
x1w1.grad = x1w1x2w2.grad
x2w2.grad = x1w1x2w2.grad 
 
# the final 4 nodes (x1, w1, x2, w2) flow through a '*' node. Per the (local) CHAIN RULE, their gradients are the incoming gradient times the other input's value:
x1.grad = x1w1.grad * w1.data # do/dx1 = do/dx1w1 * d(x1w1)/dx1 = x1w1.grad * w1.data = 0.5 * -3 = -1.5
w1.grad = x1w1.grad * x1.data
x2.grad = x2w2.grad * w2.data
w2.grad = x2w2.grad * x2.data
 
draw_dot(o)
[graphviz output: same graph with gradients filled in; o.grad = 1.0, n.grad = 0.5, b.grad = 0.5, (x1*w1 + x2*w2).grad = 0.5, x1*w1.grad = 0.5, x2*w2.grad = 0.5, x1.grad = -1.5, w1.grad = 1.0, x2.grad = 0.5, w2.grad = 0.0]

8.3.2 Analysis of this backpropagation graph

  • Note how w2.grad = 0 in the graph above.
  • This makes sense because w2.grad tells us how nudging w2 affects the final output o
    • Since x2 (the neuron input) is 0, it doesn’t matter how we nudge w2: o remains unchanged (because x2 and w2 combine through a * node)
    • So o is totally insensitive to w2, hence w2.grad = 0 (a quick numerical check of this is sketched below)
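As a quick numerical sanity check of these gradients, here is a minimal sketch (the forward helper below is hypothetical and not part of the original notebook): nudge one parameter by a small h, recompute the forward pass, and compare the finite-difference slope against the hand-assigned gradients.

# hypothetical finite-difference check of the hand-assigned gradients
import math

def forward(x1d=2.0, x2d=0.0, w1d=-3.0, w2d=1.0, bd=6.8813735870195432):
    return math.tanh(x1d*w1d + x2d*w2d + bd)

h = 1e-4
o0 = forward()
print((forward(w2d=1.0 + h) - o0) / h)   # ~0.0  -> matches w2.grad (because x2 = 0)
print((forward(w1d=-3.0 + h) - o0) / h)  # ~1.0  -> matches w1.grad
print((forward(x1d=2.0 + h) - o0) / h)   # ~-1.5 -> matches x1.grad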

Next: Automate backprop - Implement _backward method

_backward implements the chain rule locally at each node and passes the resulting gradient on to that node's inputs (its upstream nodes); a minimal sketch of this pattern follows the list below

  • Leaf node (e.g. input data / input-layer weights): do nothing.
    • Why: a leaf has no further nodes to pass its gradient on to
  • Addition node (+): the incoming gradient is distributed as-is
    • Why: the local derivative is 1.0, so the gradient passes on unchanged
  • Multiplication node (*): the incoming gradient is multiplied by the other input's value (the multipliers “swap”)
    • Why: the local derivative with respect to one input is the value of the other input
  • Activation functions (e.g. ReLU, Sigmoid, tanh): multiply the incoming gradient by the derivative of the activation function
  • Maximum node: switch behaviour - the gradient is 100% routed to the largest input, 0 to the others.
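The next chapter implements this properly inside Value; as a preview, here is a minimal, hedged sketch of what the _backward closures could look like for '+', '*' and tanh (the class name ValueWithBackward is invented for this illustration and is not the notebook's final implementation):

# hypothetical preview of the _backward pattern (see the next chapter for the real version)
import math

class ValueWithBackward:
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None   # leaf node: nothing further to propagate to
        self._prev = set(_children)
        self._op = _op

    def __add__(self, other):
        out = ValueWithBackward(self.data + other.data, (self, other), '+')
        def _backward():
            # '+': local derivative is 1.0, so the incoming gradient passes through unchanged
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        out = ValueWithBackward(self.data * other.data, (self, other), '*')
        def _backward():
            # '*': incoming gradient times the *other* input's value
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        t = math.tanh(self.data)
        out = ValueWithBackward(t, (self,), 'tanh')
        def _backward():
            # tanh: incoming gradient times the local derivative 1 - tanh(x)^2
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

Calling each node's _backward(), from the output back towards the leaves, reproduces the hand-assigned gradients above.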

Note: See backprop-graph-terminology

Sources